This document contains code that is long to display in the week’s slides. Unlike code in the slides, all examples below are executed so you can see the values.

You should use these programming supplements to follow along and try executing the example code yourself.

Variables and Namespaces

When you execute a statement like x <- 5 in R, you are creating an object in memory which holds the numeric value 5 and is referenced by the variable name “x”.

If you later ask R to do something like y <- x + 2, it will search sequentially through a series of namespaces until it finds a variable called “x”. Namespaces can be thoguht of as collections of labels pointing to places in memory. You can use R’s search() command to examine the ordered list of namespaces that R will search in for variables.

# Check the search path of namespaces
search()
## [1] ".GlobalEnv"        "package:stats"     "package:graphics" 
## [4] "package:grDevices" "package:utils"     "package:datasets" 
## [7] "package:methods"   "Autoloads"         "package:base"
# use ls() to list the objects in one of those namespaces
ls("package:stats")
##   [1] "acf"                  "acf2AR"               "add.scope"           
##   [4] "add1"                 "addmargins"           "aggregate"           
##   [7] "aggregate.data.frame" "aggregate.ts"         "AIC"                 
##  [10] "alias"                "anova"                "ansari.test"         
##  [13] "aov"                  "approx"               "approxfun"           
##  [16] "ar"                   "ar.burg"              "ar.mle"              
##  [19] "ar.ols"               "ar.yw"                "arima"               
##  [22] "arima.sim"            "arima0"               "arima0.diag"         
##  [25] "ARMAacf"              "ARMAtoMA"             "as.dendrogram"       
##  [28] "as.dist"              "as.formula"           "as.hclust"           
##  [31] "as.stepfun"           "as.ts"                "asOneSidedFormula"   
##  [34] "ave"                  "bandwidth.kernel"     "bartlett.test"       
##  [37] "BIC"                  "binom.test"           "binomial"            
##  [40] "biplot"               "Box.test"             "bw.bcv"              
##  [43] "bw.nrd"               "bw.nrd0"              "bw.SJ"               
##  [46] "bw.ucv"               "C"                    "cancor"              
##  [49] "case.names"           "ccf"                  "chisq.test"          
##  [52] "cmdscale"             "coef"                 "coefficients"        
##  [55] "complete.cases"       "confint"              "confint.default"     
##  [58] "confint.lm"           "constrOptim"          "contr.helmert"       
##  [61] "contr.poly"           "contr.SAS"            "contr.sum"           
##  [64] "contr.treatment"      "contrasts"            "contrasts<-"         
##  [67] "convolve"             "cooks.distance"       "cophenetic"          
##  [70] "cor"                  "cor.test"             "cov"                 
##  [73] "cov.wt"               "cov2cor"              "covratio"            
##  [76] "cpgram"               "cutree"               "cycle"               
##  [79] "D"                    "dbeta"                "dbinom"              
##  [82] "dcauchy"              "dchisq"               "decompose"           
##  [85] "delete.response"      "deltat"               "dendrapply"          
##  [88] "density"              "density.default"      "deriv"               
##  [91] "deriv3"               "deviance"             "dexp"                
##  [94] "df"                   "df.kernel"            "df.residual"         
##  [97] "DF2formula"           "dfbeta"               "dfbetas"             
## [100] "dffits"               "dgamma"               "dgeom"               
## [103] "dhyper"               "diffinv"              "dist"                
## [106] "dlnorm"               "dlogis"               "dmultinom"           
## [109] "dnbinom"              "dnorm"                "dpois"               
## [112] "drop.scope"           "drop.terms"           "drop1"               
## [115] "dsignrank"            "dt"                   "dummy.coef"          
## [118] "dummy.coef.lm"        "dunif"                "dweibull"            
## [121] "dwilcox"              "ecdf"                 "eff.aovlist"         
## [124] "effects"              "embed"                "end"                 
## [127] "estVar"               "expand.model.frame"   "extractAIC"          
## [130] "factanal"             "factor.scope"         "family"              
## [133] "fft"                  "filter"               "fisher.test"         
## [136] "fitted"               "fitted.values"        "fivenum"             
## [139] "fligner.test"         "formula"              "frequency"           
## [142] "friedman.test"        "ftable"               "Gamma"               
## [145] "gaussian"             "get_all_vars"         "getCall"             
## [148] "getInitial"           "glm"                  "glm.control"         
## [151] "glm.fit"              "hasTsp"               "hat"                 
## [154] "hatvalues"            "hclust"               "heatmap"             
## [157] "HoltWinters"          "influence"            "influence.measures"  
## [160] "integrate"            "interaction.plot"     "inverse.gaussian"    
## [163] "IQR"                  "is.empty.model"       "is.leaf"             
## [166] "is.mts"               "is.stepfun"           "is.ts"               
## [169] "is.tskernel"          "isoreg"               "KalmanForecast"      
## [172] "KalmanLike"           "KalmanRun"            "KalmanSmooth"        
## [175] "kernapply"            "kernel"               "kmeans"              
## [178] "knots"                "kruskal.test"         "ks.test"             
## [181] "ksmooth"              "lag"                  "lag.plot"            
## [184] "line"                 "lm"                   "lm.fit"              
## [187] "lm.influence"         "lm.wfit"              "loadings"            
## [190] "loess"                "loess.control"        "loess.smooth"        
## [193] "logLik"               "loglin"               "lowess"              
## [196] "ls.diag"              "ls.print"             "lsfit"               
## [199] "mad"                  "mahalanobis"          "make.link"           
## [202] "makeARIMA"            "makepredictcall"      "manova"              
## [205] "mantelhaen.test"      "mauchly.test"         "mcnemar.test"        
## [208] "median"               "median.default"       "medpolish"           
## [211] "model.extract"        "model.frame"          "model.frame.default" 
## [214] "model.matrix"         "model.matrix.default" "model.matrix.lm"     
## [217] "model.offset"         "model.response"       "model.tables"        
## [220] "model.weights"        "monthplot"            "mood.test"           
## [223] "mvfft"                "na.action"            "na.contiguous"       
## [226] "na.exclude"           "na.fail"              "na.omit"             
## [229] "na.pass"              "napredict"            "naprint"             
## [232] "naresid"              "nextn"                "nlm"                 
## [235] "nlminb"               "nls"                  "nls.control"         
## [238] "NLSstAsymptotic"      "NLSstClosestX"        "NLSstLfAsymptote"    
## [241] "NLSstRtAsymptote"     "nobs"                 "numericDeriv"        
## [244] "offset"               "oneway.test"          "optim"               
## [247] "optimHess"            "optimise"             "optimize"            
## [250] "order.dendrogram"     "p.adjust"             "p.adjust.methods"    
## [253] "pacf"                 "Pair"                 "pairwise.prop.test"  
## [256] "pairwise.t.test"      "pairwise.table"       "pairwise.wilcox.test"
## [259] "pbeta"                "pbinom"               "pbirthday"           
## [262] "pcauchy"              "pchisq"               "pexp"                
## [265] "pf"                   "pgamma"               "pgeom"               
## [268] "phyper"               "plclust"              "plnorm"              
## [271] "plogis"               "plot.ecdf"            "plot.spec.coherency" 
## [274] "plot.spec.phase"      "plot.stepfun"         "plot.ts"             
## [277] "pnbinom"              "pnorm"                "poisson"             
## [280] "poisson.test"         "poly"                 "polym"               
## [283] "power"                "power.anova.test"     "power.prop.test"     
## [286] "power.t.test"         "PP.test"              "ppoints"             
## [289] "ppois"                "ppr"                  "prcomp"              
## [292] "predict"              "predict.glm"          "predict.lm"          
## [295] "preplot"              "princomp"             "printCoefmat"        
## [298] "profile"              "proj"                 "promax"              
## [301] "prop.test"            "prop.trend.test"      "psignrank"           
## [304] "pt"                   "ptukey"               "punif"               
## [307] "pweibull"             "pwilcox"              "qbeta"               
## [310] "qbinom"               "qbirthday"            "qcauchy"             
## [313] "qchisq"               "qexp"                 "qf"                  
## [316] "qgamma"               "qgeom"                "qhyper"              
## [319] "qlnorm"               "qlogis"               "qnbinom"             
## [322] "qnorm"                "qpois"                "qqline"              
## [325] "qqnorm"               "qqplot"               "qsignrank"           
## [328] "qt"                   "qtukey"               "quade.test"          
## [331] "quantile"             "quasi"                "quasibinomial"       
## [334] "quasipoisson"         "qunif"                "qweibull"            
## [337] "qwilcox"              "r2dtable"             "rbeta"               
## [340] "rbinom"               "rcauchy"              "rchisq"              
## [343] "read.ftable"          "rect.hclust"          "reformulate"         
## [346] "relevel"              "reorder"              "replications"        
## [349] "reshape"              "resid"                "residuals"           
## [352] "residuals.glm"        "residuals.lm"         "rexp"                
## [355] "rf"                   "rgamma"               "rgeom"               
## [358] "rhyper"               "rlnorm"               "rlogis"              
## [361] "rmultinom"            "rnbinom"              "rnorm"               
## [364] "rpois"                "rsignrank"            "rstandard"           
## [367] "rstudent"             "rt"                   "runif"               
## [370] "runmed"               "rweibull"             "rwilcox"             
## [373] "rWishart"             "scatter.smooth"       "screeplot"           
## [376] "sd"                   "se.contrast"          "selfStart"           
## [379] "setNames"             "shapiro.test"         "sigma"               
## [382] "simulate"             "smooth"               "smooth.spline"       
## [385] "smoothEnds"           "sortedXyData"         "spec.ar"             
## [388] "spec.pgram"           "spec.taper"           "spectrum"            
## [391] "spline"               "splinefun"            "splinefunH"          
## [394] "SSasymp"              "SSasympOff"           "SSasympOrig"         
## [397] "SSbiexp"              "SSD"                  "SSfol"               
## [400] "SSfpl"                "SSgompertz"           "SSlogis"             
## [403] "SSmicmen"             "SSweibull"            "start"               
## [406] "stat.anova"           "step"                 "stepfun"             
## [409] "stl"                  "StructTS"             "summary.aov"         
## [412] "summary.glm"          "summary.lm"           "summary.manova"      
## [415] "summary.stepfun"      "supsmu"               "symnum"              
## [418] "t.test"               "termplot"             "terms"               
## [421] "terms.formula"        "time"                 "toeplitz"            
## [424] "ts"                   "ts.intersect"         "ts.plot"             
## [427] "ts.union"             "tsdiag"               "tsp"                 
## [430] "tsp<-"                "tsSmooth"             "TukeyHSD"            
## [433] "uniroot"              "update"               "update.default"      
## [436] "update.formula"       "var"                  "var.test"            
## [439] "variable.names"       "varimax"              "vcov"                
## [442] "weighted.mean"        "weighted.residuals"   "weights"             
## [445] "wilcox.test"          "window"               "window<-"            
## [448] "write.ftable"         "xtabs"

Dollarstore Calculator Math

# Addition with "+"
4 + 5
## [1] 9
# Subtraction with "-"
100 - 99
## [1] 1
# Multiplication with "*"
4 * 5
## [1] 20
# Division with "/"
15 / 3
## [1] 5
# Exponentiation with "^"
2^3
## [1] 8
# Order of Operations
4 * 5 + 5 / 5
## [1] 21
# Control with parentheses
4 * (5 + 5) / 5
## [1] 8

Data Structures

Vectors

  • Because R was designed for use with statistics, most of its operations are vectorized
  • You can create vectors a few ways:
# Ordered sequence of integers
1:5
## [1] 1 2 3 4 5
# Counting by 2s
seq(from = 0, to = 14, by = 2)
## [1]  0  2  4  6  8 10 12 14
# Replicate the same values
rep(TRUE, 6)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
# Concatenate multiple values with the "c" operator
c("all", "of", "the", "lights")
## [1] "all"    "of"     "the"    "lights"
# Watch out! Mixing types wil lead to silent coercion
c(1, TRUE, "hellos")
## [1] "1"      "TRUE"   "hellos"
# Some functions, when applied over a vector, return a single value
is.numeric(rnorm(100))
## [1] TRUE
# Others will return a vector of results
is.na(c(1, 5, 10, NA, 8))
## [1] FALSE FALSE FALSE  TRUE FALSE
# Vectors can be named
batting_avg <- c(
  youkilis = 0.300
  , ortiz = 0.355
  , nixon = 0.285
)
print(batting_avg)
## youkilis    ortiz    nixon 
##    0.300    0.355    0.285
# You can combine two vectors with c()
x <- c("a", "b", "c")
y <- c("1", "2", "3")
c(x, y)
## [1] "a" "b" "c" "1" "2" "3"

Lists

Vectors are the first multi-item data structure all R programmers learn. Soon, though, you may find yourself frustrated with the fact that they can only hold a single type. To handle casses where you want to package multiple types (and even multiple objects!) together, we will turn to a data structure called a list.

Capabilities Vectors Lists
Optional use of named elements
Support math operations like mean()
Hold multiple types
Hold multiple objects
# Create a list with list()
myList <- list(
  a = 1
  , b = rep(TRUE, 10)
  , x = c("shakezoola", "mic", "rulah")
)

# Examine it with str()
str(myList)
## List of 3
##  $ a: num 1
##  $ b: logi [1:10] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ x: chr [1:3] "shakezoola" "mic" "rulah"

Factors

Imagine that you want to build a model of the relationship between resource wealth and quality-of-life outcomes like life expectancy. You got out to the World Bank to grab some data, and the dataset you get includes a column called “region” with values like “Africa”, “European Union”, and “South America”. How can you use this variable in a model or for generating region-by-region summary stats? This is where R’s factor type comes in.

regions <- c("Africa", "European Union", "South America", "South America", "European Union")
region_fac <- as.factor(regions)

str(regions)
##  chr [1:5] "Africa" "European Union" "South America" "South America" ...
str(region_fac)
##  Factor w/ 3 levels "Africa","European Union",..: 1 2 3 3 2

What does it mean for region to be a factor? Essentially, a factor is a categorical variable. R uses a cool trick to save memory when storing factors…internally, R will convert factor values to integers and then keep aroudn a single table the tells it, e.g., that 1 = “Africa”, 2 = “European Union”, etc..

# Check it out! R has assigned integer values to the "region_fac" variable
as.integer(region_fac)
## [1] 1 2 3 3 2
# But you can also access the character values if you want
as.character(region_fac)
## [1] "Africa"         "European Union" "South America"  "South America" 
## [5] "European Union"
# For more, see:
str(region_fac)
##  Factor w/ 3 levels "Africa","European Union",..: 1 2 3 3 2
levels(region_fac)
## [1] "Africa"         "European Union" "South America"

Data Frames

Vectors and lists are crucial data structures in R, but you may find that they’re difficult to work with in model training and other data science tasks. It is now time to introduce a third foundational data structure: the data frame.

Data frames are tables of data. Each column of a data frame can be a different type, but all values within a column must be the same type.

# Create a data frame!
myDF <- data.frame(
    time_period    = c(1, 2, 3, 4, 5)
    , temperature  = c(25.6, 38.7, 31.4, 40.0, 29.20)
    , station      = c("A", "B", "A", "A", "B")
    , is_gov_owned = c(TRUE, FALSE, TRUE, TRUE, FALSE)
)

# Check out the structure of this thing
str(myDF)
## 'data.frame':    5 obs. of  4 variables:
##  $ time_period : num  1 2 3 4 5
##  $ temperature : num  25.6 38.7 31.4 40 29.2
##  $ station     : chr  "A" "B" "A" "A" ...
##  $ is_gov_owned: logi  TRUE FALSE TRUE TRUE FALSE

R comes with some sample data sets you can experiment with. Let’s load the mtcars data.frame and test out some new commands!

# Load the mtcars data frame
data("mtcars")

# Check out its structure
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# View the top 10 rows
head(mtcars, n = 10) # could use "tail" for the bottom 10
##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230          22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280          19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
# Find all the unique values of "cyl" (the number of engine cylinders)
unique(mtcars$cyl)
## [1] 6 4 8

Logical Operators

Often in your code, you’ll want to do/not do something or select / not select some data based on a logical condition (a statement that evaluates to TRUE or FALSE). Here are some examples of how to construct these statements in R.

# "and" logic is expressed with "&"
TRUE & TRUE   # TRUE
## [1] TRUE
TRUE & FALSE  # FALSE
## [1] FALSE
FALSE & FALSE # FALSE
## [1] FALSE
-5 < 5 & 3 > 2 # TRUE
## [1] TRUE
# "or" logic is expressed with "|"
TRUE | TRUE    # TRUE
## [1] TRUE
TRUE | FALSE   # TRUE
## [1] TRUE
FALSE | FALSE  # FALSE
## [1] FALSE
3 < 8 | 8 > 19 # TRUE
## [1] TRUE

The most common operators used to generate logicals are >, <, ==, and !=

# "equality" logic is specified with "=="
3 == 3   # TRUE
## [1] TRUE
4 == 4.1 # FALSE
## [1] FALSE
# "not" logic is specified with !. In a special case, != signifies "not equal"
!TRUE            # FALSE
## [1] FALSE
!FALSE           # TRUE
## [1] TRUE
! (TRUE | FALSE) # FALSE
## [1] FALSE
4 != 5           # TRUE
## [1] TRUE
# "greater than" and "less than" logic are specified in the way you might expect
5 < 5  # FALSE
## [1] FALSE
6 <= 6 # TRUE
## [1] TRUE
4 > 2  # TRUE
## [1] TRUE
3 >= 3 # TRUE
## [1] TRUE

As we learned in week two, you can use vectors of logicals (TRUE and FALSE) to subset other objects. As a general rule, when you put a vector on the left-hand side of a logical condition like == or >, you will get back a vector as a result.

# Load some data
data("mtcars")

# Create a logical index. Note that we get a VECTOR of logicals back
bigCarIndex <- mtcars$wt > 4

# Taking the SUM of a logical vector tells you the number of TRUEs.
# Taking the MEAN of a logical vector tells you the proportion of TRUEs
sum(bigCarIndex)
## [1] 4
mean(bigCarIndex)
## [1] 0.125

Subsetting

Subsetting Vectors

Subsetting is the act of retrieving a portion of an object, usually based on some logical condition (e.g. “all elements greater than 5”). In R, this is done with the [ operator.

# Create a vector to work with
myVec <- c(
  var1 = 10
  , var2 = 15
  , var3 = 20
  , av4 = 6
)

# "the first element"
myVec[1]
## var1 
##   10
# "second to fourth elements"
myVec[2:4]
## var2 var3  av4 
##   15   20    6
# "the element named var3"
myVec["var3"]
## var3 
##   20

Subsetting Lists

Lists, arbitrary collections of (potentially heterogeneous) R objects, are subsetted a bit differently than vectors. If you use [ with a list, you are guaranteed to get back another list. If you use [[, you will get back an object in its natural form (whatever it would look like if it wasn’t in the list). You can also use $ to access named elements.

# Set up a sample list object
someList <- list(
    country = "DEU"
    , region  = "EU"
    , econ_stats = list(
        gdp_growth = 1.5
        , rm10y    = 2.75
        , DAXmonthly = c(1000, 1100, 1050, 1200, 1500)
    )
)

# Examine the list structure
str(someList)
## List of 3
##  $ country   : chr "DEU"
##  $ region    : chr "EU"
##  $ econ_stats:List of 3
##   ..$ gdp_growth: num 1.5
##   ..$ rm10y     : num 2.75
##   ..$ DAXmonthly: num [1:5] 1000 1100 1050 1200 1500
# Grab the country name
country1 <- someList[["country"]]
country2 <- someList$country

class(country1)
## [1] "character"
class(country2)
## [1] "character"
identical(country1, country2)
## [1] TRUE
# Grab the economic stats as a list within a list
germanyStats <- someList["econ_stats"]
class(germanyStats)
## [1] "list"
# Grab the numeric vector of stock prices
daxPrices <- someList$econ_stats$DAXmonthly
class(daxPrices)
## [1] "numeric"
# Another way to parse down the list for stock prices
someList[["econ_stats"]][["DAXmonthly"]]
## [1] 1000 1100 1050 1200 1500

Subsetting Data Frames

Data frames are the workhorse data structure of statistics in R. The best way to learn data frame subsetting is to just walk through the examples below:

# Create a data frame
someDF <- data.frame(
    conference  = c("Big East", "Big Ten", "Big East", "ACC", "SEC")
    , school_name = c("Villanova", "Minnesota", "Marquette", "Duke", "LSU")
    , wins        = c(18, 14, 19, 24, 12)
    , ppg         = c(71.5, 45.8, 66.9, 83.4, 58.7)
)

# Grab the wins column (NOTE: will give you back a vector)
someDF[, "wins"]
## [1] 18 14 19 24 12
someDF[["wins"]]
## [1] 18 14 19 24 12
# Grab conference and wins columns
someDF[, c("conference", "wins")]
##   conference wins
## 1   Big East   18
## 2    Big Ten   14
## 3   Big East   19
## 4        ACC   24
## 5        SEC   12
# Grab the first 3 rows and the two numeric columns
someDF[1:3, c("wins", "ppg")]
##   wins  ppg
## 1   18 71.5
## 2   14 45.8
## 3   19 66.9

Using Logical Vectors

So far, we’ve seen how to subset R objects using numeric indices and named elements. These are useful approaches, but both require you to know something about the contents of the obejct you’re working with.

Using these methods (especially numeric indices like saying give me columns 2-4) can make your code confusing and hard for others to reason about. Wherever possible, I strongly recommend using logical vectors for subsetting. This makes your code intuitive and more flexible to change.

# Some data frame
playerDF <- data.frame(
    playerName   = c("Rajon Rondo", "Paul Pierce", "Kevin Garnett", "Ray Allen", "Big Baby")
    , position   = c("PG", "SF", "PF", "SG", "C")
    , apg        = c(14.2, 3.5, 4.6, 5.0, 0.7)
    , ppg        = c(4.8, 31.0, 24.6, 33.2, 12.3)
    , fg_percent = c(-.062, 0.48, 0.63, 0.80, 0.51)
)

# "Give me a data frame with all Rondo and KG's stats"
nameIndex <- playerDF$playerName %in% c("Rajon Rondo", "Kevin Garnett")
nameIndex
## [1]  TRUE FALSE  TRUE FALSE FALSE
class(nameIndex)
## [1] "logical"
playerDF[nameIndex,]
##      playerName position  apg  ppg fg_percent
## 1   Rajon Rondo       PG 14.2  4.8     -0.062
## 3 Kevin Garnett       PF  4.6 24.6      0.630
# "Give me apg for players who averaged more than 20 ppg"
playerDF[playerDF$ppg > 20, c("playerName", "apg")]
##      playerName apg
## 2   Paul Pierce 3.5
## 3 Kevin Garnett 4.6
## 4     Ray Allen 5.0
# "Give me stats for all the guards""
guardIndex <- grepl("G$", playerDF$position)
print(guardIndex)
## [1]  TRUE FALSE FALSE  TRUE FALSE
playerDF[grepl("G$", playerDF$position)]
##      playerName  ppg
## 1   Rajon Rondo  4.8
## 2   Paul Pierce 31.0
## 3 Kevin Garnett 24.6
## 4     Ray Allen 33.2
## 5      Big Baby 12.3
# "Give me stats for players who were guards OR show above 50% from the field"
playerDF[grepl("G$", playerDF$position) | playerDF$fg_percent > 0.5,]
##      playerName position  apg  ppg fg_percent
## 1   Rajon Rondo       PG 14.2  4.8     -0.062
## 3 Kevin Garnett       PF  4.6 24.6      0.630
## 4     Ray Allen       SG  5.0 33.2      0.800
## 5      Big Baby        C  0.7 12.3      0.510
# "Give me the column names that start with p"
names(playerDF)[grepl("^p", names(playerDF))]
## [1] "playerName" "position"   "ppg"

Controlling Program Flow

If-Else

Soon after you start writing code (in any language), you’ll inevitably find yourself saying “I only want to do this thing if certain conditions are met”. This type of logic is expressed using if-else syntax

library(gaussfacts)

# Set some variable
DAY <- "SATURDAY"
print(DAY)
## [1] "SATURDAY"
# If it's Monday, print 2 Gauss facts. Otherwise, print 1
if (DAY == "MONDAY"){
    gaussfact()
    gaussfact()
} else {
    gaussfact()
}
## Gauss can find a perfect cube larger than 8 in Fibonacci Sequence.

What if you want to express more than two possible outcomes? For this, we could use R’s else if construct to nest conditions. Note that conditional blocks can have any number of “else if” statements, but only one “else” block.

# Try to think through what this will do before you run it yourself:
if (4 > 5){
    print("3")
} else if (6 <= (5/10)) {
    print("1")
} else if (4 + 4 + 4 == 12.0001) {
    print("4")
} else {
    print("2")
}
## [1] "2"

For Loops

One of the most powerful characteristics of general purpose programming languages is their ability to automate repetitive tasks. When you know that you want to do something a fixed number of times (say, squaring each item in a vector), you can use a for loop.

# Create a vector
x <- c(1,4,6)

# Print the square of each element one at a time
print(1^2)
## [1] 1
print(4^2)
## [1] 16
print(6^2)
## [1] 36
# BETTER: Loop over the vector and print the square of each element
for (some_number in x){
    print(some_number^2)
}
## [1] 1
## [1] 16
## [1] 36

While Loops

For loops are suitable for many applications, but they can be too restrictive in some cases. For example, imagine that you are writing a simple movie search engine and you want to tell R “look through an alphabetized list of movies and tell me if you find Apocalypse Now”. A for loop can certainly do this, but it will keep running over ALL movies…long after it finds Ace Ventura! This is a great place to use a while loop.

Here is the for loop implementation:

movie_list <- c(
    "ace ventura"
    , "apocalypse now"
    , "return of the jedi"
    , "v for vendetta"
    , "zoolander"
)
MOVIE_TO_SEARCH_FOR <- "apocalypse now"

# Naive for loop implementation
i <- 1
for (movie in movie_list) {
    if (movie == MOVIE_TO_SEARCH_FOR) {
        print(paste0(i, ": found it!"))
    } else {
        print(paste0(i, ": not found"))
    }
    i <- i + 1
}
## [1] "1: not found"
## [1] "2: found it!"
## [1] "3: not found"
## [1] "4: not found"
## [1] "5: not found"

And here is the while loop implementation. Notice that this one stops checking once it finds what it wants. In this case, it’s more efficient to use a while loop to solve this problem.

movie_list <- c(
    "ace ventura"
    , "apocalypse now"
    , "return of the jedi"
    , "v for vendetta"
    , "zoolander"
)
MOVIE_TO_SEARCH_FOR <- "apocalypse now"

# Faster while loop implementation
KEEP_SEARCHING <- TRUE
i = 1
while (KEEP_SEARCHING){
    
    # Check this element
    if (movie_list[i] == MOVIE_TO_SEARCH_FOR) {
        print(paste0(i, ": found it!"))
        KEEP_SEARCHING <- FALSE
    } else {
        print(paste0(i, ": not found"))
    }
    
    # If we've reached the end, break out. Otherwise, increment the counter
    if (i == length(movie_list)){
        print("Done searching. This movie isn't in the list")
    } else {
        i <- i + 1
    }
}
## [1] "1: not found"
## [1] "2: found it!"

Functions

R is a functional programming language. To write powerful, concise code, you’ll need to master the use and creation of functions.

“If you find yourself copying and pasting the same code more than twice, it’s time to write a function.” - Hadley Wickham

# Function to return only the numbers smaller than 10
nums <- c(1, 3, 4, 8, 13, 24)

getLittleNumbers <- function(some_numbahs){
    lil_ones <- some_numbahs[some_numbahs < 10]
    return(lil_ones)
}

getLittleNumbers(nums)
## [1] 1 3 4 8

Required Arguments

  • R functions take 0 or more arguments…basically named variables that the function uses to do it’s work
  • Take a look at ?sqrt. You’ll see that it takes one argument, named x. You can pass any vector of numeric values to this argument and sqrt() will return the square root of each element
  • In this case, we’d say x is a required argument of sqrt()
# Take the square root of a vector of numbers
sqrt(x = c(1, 4, 9, 16, 25))
## [1] 1 2 3 4 5
# Note that calling this function without the argument will throw an error!
sqrt()
## Error in sqrt(): 0 arguments passed to 'sqrt' which requires 1

Default Argument Values

  • For more complicated functions, passing values to each argument can get burdensome
  • To handle this, R allows function authors to specify default values. These are values that certain arguments will take automatically unless you decide to overwrite them
  • Example: look at ?rnorm. You’ll see that this function’s signature reads rnorm(n, mean = 0, sd = 1).
# 100 random draws from a normal distribution w/ mean 0 and standard deviation 1
rand_nums <- rnorm(n = 100)

# 100 random draws from a normal distribution w/ mean 4.5 and standard deviation 1
rand_nums <- rnorm(n = 100, mean = 4.5)

Functions That Return Stuff

As you’ve seen in previous examples, the R special word return tells a function to “give back” some value. When you execute an expression like x <- someFunction(), that function’s return value (an R object) is stored a variable called “x”.

Unlike in some other programming languages, R allows you to use multiple return values inside the body of a function. The first time that the code inside the function reaches a return value, it will pass that value back out of the function and immediately stop executing the function.

# simple function + print debugging
simpFunction <- function(n){
    print("first print")
    if (n > 5){
      return(TRUE)
    }
    
    print("second print")
    if (n < 5){
      return(FALSE)
    }
    
    print("third print")
    return("TRALSE")
}

simpFunction(100)
## [1] "first print"
## [1] TRUE
simpFunction(5)
## [1] "first print"
## [1] "second print"
## [1] "third print"
## [1] "TRALSE"

Functions That Return Nothing

Not all functions have to return something! Sometimes you may want to create a function that just has some side effect like creating a plot, writing to a file, or print to the console.

These are called “null functions” and they’re common in scripting languages like R. By default, these functions return the R special value NULL.

printSentence <- function(theSentence){
    words <- strsplit(
      x = theSentence
      , split = " "
    )
    for (word in words){
        print(word)
    }
}

# Assigning to an object is irrelevant...this function doesn't return anything
x <- printSentence("Hip means to know, it's a form of intelligence")
## [1] "Hip"          "means"        "to"           "know,"        "it's"        
## [6] "a"            "form"         "of"           "intelligence"
x
## NULL

Scoping

Remember when we talked about namespaces and how R searches for objects? It’s time to extend that logic to functions…which is where things get a bit weird and hard to understand.

R uses a search technique called lexical scoping

# Define an object + function in the global environment
x <- 4
someFunc <- function(x){
    return(x^2)
}

# If you call someFunc without an argument, it will go up one environment and look
# for "X" in the global environment
someFunc(x)
## [1] 16
# If you pass in a new value, it will use that instead. Note that the value of
# x in the global environment remains unchanged. 
someFunc(x = 6)
## [1] 36
x
## [1] 4

Looping with lapply

R’s *apply family of functions are a bit difficult to understand at first, but soon you’ll come to love them. They make your code more expressive, flexible, and parallelizable (more on that final point later). One of the most popular is lapply() (“list apply”), which loops over a thing (e.g. vector, list) and returns a 1-level list. Let’s try it out:

# Create a list
studentList <- list(
  kanye = c(80, 90, 100)
  , talib = c(95, 85, 99)
  , common = c(100, 100, 99)
)

# Better way with lapply
grades <- lapply(studentList, mean)

Looping with sapply

You’ve seen how to loop over a vector/list and get back a list of function results. This may not be appropriate for some settings. Remember that you cannot execute statistical operations like mean() over a list. For that, we’d probably prefer to have a vector of results. This is where R’s sapply() (“simplified apply”) comes in. sapply() works the same way that lapply() does but returns a vector. Try it for yourself:

# Get some data
data("ChickWeight")
weights <- ChickWeight$weight

# Loop over and encode "above mean" and "below mean"
the_mean <- mean(weights)
meanCheck <- function(val, ref_mean){
    if (val > ref_mean){
        return("above mean")
    }
    if (val < ref_mean){
        return("below mean")
    }
    return("equal to the mean")
}
check_vec <- sapply(
    X = weights
    , FUN = function(x){
        meanCheck(val = x, ref_mean = the_mean)
    }
)

Looping with apply

When analyzing real-world datasets, you may want to use the same looping convention we’ve been discussing, but apply it over many items and the get some summary (such as the median) of the results. This is where R’s apply() function comes in!

apply() allows you to loop over the rows or columns of a data frame and execute an arbitrary function. The code below holds some examples of what can be accomplished with apply().

# Get some data
data("ChickWeight")

# Calculate column-wise range
cwRanges <- apply(
    X = ChickWeight
    , MARGIN = 2
    , FUN = function(x){
        range(as.numeric(x))
    }
)

# Calculate row-wise range
rwRanges <- apply(
    X = ChickWeight
    , MARGIN = 1
    , FUN = function(blah){
        range(as.numeric(blah))
    }
)

Debugging

Using External Packages

This example shows some interesting data visualizations you can make using {data.table}, {quantmod}, and {rbokeh}.

Install these packages:

install.packages(c("data.table", "quantmod", "rbokeh"))
``


```r
# Load dependencies
library(data.table)
library(quantmod)
library(rbokeh)

# Get data and plot it
quantmod::getSymbols('CPIAUCSL', src = 'FRED')
## [1] "CPIAUCSL"
cpiDT <- data.table::data.table(
    CPIAUCSL
    , keep.rownames = TRUE
)

# Plot it!
rbokeh::figure(
    data = cpiDT
    , title = "U.S. CPI"
    , ylab = "Index (1982-1984 = 100)"
    , xlab = "date"
) %>% ly_lines(x = cpiDT$index, y = cpiDT$CPIAUCSL, color = "blue")

Sourcing Helper Functions

In the examples so far, I’ve been defining functions and using them in the same breath. As your projects grow in complexity, you will find this really tedious and hard to keep track of.

An alternative that many intermediate R programmers turn to is defining one or more helperfuns.R scripts to store custom functions and then using source() at the top of their run scripts to make those functions available in the main program.

Let’s create two scripts.

script 1: helperfuns.R

# Define some arbitrary custom function
myFunction <- function(n){
    if (n <= 5){
        return(n^2)
    } else {
        return(n^3)
    }
}

script 2: run_script.R

# uncomment the line below to source in the helper functions file
#source("helperfuns.R")

# Apply that function over some data
le_stuff <- sapply(c(1:20), myFunction)

# Plot the result
plot(x = 1:20, y = le_stuff)

Working with Files

File Paths

Whenever you find yourself reading data into R or writing data out of it, you will need to work with file paths. File paths are just addresses on your computer’s file system. These paths can either be relative (expressed as steps above/below your current location) or absolute (full addresses).

All relative paths in R are relative to your working directory, a single location that you can set and reset any time in your session.

# Check and then change the current working directory
print(getwd())
sandbox_repo <- file.path(Sys.getenv("HOME"), "repos", "sandbox")
setwd(sandbox_repo)

# Reference a file with a full path
myDF <- read.csv(
    file = file.path(
        sandbox_repo
        , "data"
        , "some_data.csv"
    )
)

# Reference a file with a relative path
myDF <- read.csv(file = "./data/some_data.csv")

R provides a few other utilities for working with file paths and directory structures.

  • file.path(): create a filepath from multiple parts
  • file.exists(): returns TRUE is a file exists and FALSE if it doesn’t
  • list.files(): get a character vector with names of all files found in a directory
  • dir.exists(): returns TRUE is a directory exists and FALSE if it doesn’t
  • dir.create(): create a new directory
# List all the files in some directory and put the list in a vector
docs_dir <- file.path(
    Sys.getenv("HOME")
    , "some_folder"
    , "docs"
)
theFiles <- list.files(path = docs_dir)

# Create a directory if it doesn't exist
slide_dir <- file.path(docs_dir, "slides")
if (!dir.exists(slide_dir)) {
    dir.create(slide_dir)
}

# Check if a file exists
report_file <- file.path(docs_dir, "report.xlsx")
myFileExists <- file.exists(report_file)

Downloading files from the internet

To download files hosted on the internet, you can use download.file().

download.file(
  url = "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/iris.csv"
  , destfile = "iris.csv"
)

CSV

CSV stands for “comma-separated values”. This format is a really common to share small, tabular datasets because it is just a text file, and can be ready by many different types of programs. R has several options for reading this type of file.

  • read.csv(): base R function for reading CSVs into a data.frame
  • data.table::fread(): super-fast CSV reader that creates a special type of data.frame called a data.table
  • read.delim(): similar to read.csv(), but can read files with any delimiter
  • readr::read_csv(): CSV reader from RStudio

A CSV is just a plaintext file where the first row is column names separated by columns and each following row is an observation with columns separated by commas.

date,open,close
01-01-2012,10.5,8.9
01-02-2012,8.9,10.3

To begin this example, download the example file from the course repository.

iris_url <- "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/iris.csv"
iris_file <- "iris.csv"
download.file(
  url = iris_url
  , destfile = iris_file
)

Then read it in with read.csv() and inspect it with head().

irisDF <- read.csv(
  file = iris_file
  , header = TRUE
)
print(head(irisDF))

This function can also read CSV data directly from files on the internet.

irisDF <- read.csv(
  file = iris_url
  , header = TRUE
)
print(head(irisDF))

Excel

In the Economics / Business world (and many other areas!), Microsoft Excel is pretty much unavoidable. You’ll get data from the internet, your colleagues, clients, etc. in Excel format and may want to work with it in R. There are a few packages for doing this, but in this course we’ll focus on openxlsx.

NOTE: This package requires certain Java components that you may not have on your machine. If you run into issues, I recommend 1) installing an updated version of JRE or 2) exploring other packages like xlsx or readxl.

To begin this example, download the example file from the course repository.

gdp_url <- "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/gdp.xlsx"
gdp_file <- "gdp.xlsx"
download.file(
  url = gdp_url
  , destfile = gdp_file
)

Then read it in with openxlsx::read.xlsx() and view it with head().

library(openxlsx)
gdpDF <- openxlsx::read.xlsx(
    xlsxFile = gdp_file
)
print(head(gdpDF))

This function can also read Excel data directly from files on the internet.

library(openxlsx)
gdpDF <- openxlsx::read.xlsx(
    xlsxFile = gdp_url
)
print(head(gdpDF))

You can also use this package to write Excel files. You can do really complicated stuff (like conditional formatting, named ranges, and live formulas) from inside of R. It’s tough to set up at first, but can be VERY useful if you find yourself spending a lot of time running routine reports whose format is the same from update to update.

# load mtcars
data("mtcars")

# create a workbook object in R
testWB <- openxlsx::createWorkbook()

# Add sheets and data
openxlsx::addWorksheet(
    wb = testWB
    , sheetName = "car_data"
)
openxlsx::writeData(
    wb = testWB
    , sheet = "car_data"
    , x = mtcars
)

# Write out the file
testing_file <- file.path(Sys.getenv("HOME"), "testing.xlsx")
openxlsx::saveWorkbook(
    wb = testWB
    , file = testing_file
)

JSON

JSON (“Javascript Object Notation”) is a standard format for “semi-structured” or “nested” data. It’s a plain-text format that can be used by many programs and programming languages.

{
  "status": 200,
  "data": [
    {"customer_name": "Lupe", "purchases": 10},
    {"customer_name": "Wale", "purchases": 30}
  ]
}

To begin this example, download the example file from the course repository.

tweet_url <- "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/tweets.json"
tweet_file <- "tweets.json"
download.file(
  url = tweet_url
  , destfile = tweet_file
)

Open the file in a text editor and inspect its contents. It contains a sample response from the Twitter API, which is used by developers to build applications that interact with Twitter. Here’s a preview:

{
  "created_at": "Mon May 06 20:01:29 +0000 2019",
  "id": 1125490788736032800,
  "id_str": "1125490788736032770",
  "text": "Today's new update means that you can finally add Pizza Cat to your Retweet with comments! Learn more about this ne… https://t.co/Rbc9TF2s5X",
  "display_text_range": [
    0,
    140
  ],
  "truncated": true,
  "in_reply_to_status_id": null,
  "in_reply_to_status_id_str": null,
  "in_reply_to_user_id": null,
  "in_reply_to_user_id_str": null,
  "in_reply_to_screen_name": null,
  "user": {
    "id": 2244994945,
    "id_str": "2244994945",
    "name": "Twitter Dev",
    "screen_name": "TwitterDev",
    "location": "Internet",
    "url": "https://developer.twitter.com/",
    "description": "Your official source for Twitter Platform news, updates & events. Need technical help? Visit https://twittercommunity.com/ ⌨️ #TapIntoTwitter",
    "translator_type": "null"
  }
}

There are a few packages for reading this type of data in R. The example below uses one of the most popular, {jsonlite}. Read in the file with jsonlite::fromJSON() and view its contents with str().

tweetList <- jsonlite::fromJSON(
  txt  = tweet_file
  , simplifyDataFrame = FALSE
)

str(tweetList, max.level = 3)

This function can also read JSON data directly from files on the internet.

tweetList <- jsonlite::fromJSON(
  txt  = tweet_url
  , simplifyDataFrame = FALSE
)
str(tweetList, max.level = 3)

Unstructured Text files

In the context of statistical programming, text files that are “unstructured” are those that do not have an obvious parallel in a data object like a data frame, vector, list, or key-value store. An example might be a large archive of tweets or a collection of court transcripts.

Working with these types of files typically involves reading them into R line-by line and then using something called regular expression(basically just pattern-matching on strings) to extract relevant features like word counts. For example:

# Parameters
shakespeare_url <- "https://ia800300.us.archive.org/5/items/shakespearessonn01041gut/wssnt10.txt"

# Read all the text into a vector (one line per element)
text_vec <- readLines(con = shakespeare_url)

# Examine a few random lines:
sample(text_vec, 5)
## [1] "  Thou shouldst print more, not let that copy die."
## [2] "And situation with those dancing chips,"           
## [3] "Nor can I fortune to brief minutes tell,"          
## [4] "  Hence, thou suborned informer! a true soul"      
## [5] "#2 in our series by William Shakespeare"
# Coerce everything to lower-case
text_vec <- tolower(text_vec)

# Grab just the lines with the word "thou" or "twas""
thou_art_my_vector <- text_vec[grepl(pattern = "thou|twas", text_vec)]

# Print the count of lines w/ those words
lines_with_fancy_words <- length(thou_art_my_vector)
thing_to_print <- paste0(
    "of the "
    , length(text_vec)
    , " lines of text in all of Shakespeare's sonnet, "
    , lines_with_fancy_words
    , " used thou or twas."
)
print(thing_to_print)
## [1] "of the 2921 lines of text in all of Shakespeare's sonnet, 307 used thou or twas."

Missing Values

NA is a special object in R, used to capture the idea of “a value whose value is unknown”. Confusing, right? We’re going to go through a few examples to get you feeling comfortable with missing values. They’re an inevitability in real-world data.

PRO TIP: See ?NA for R’s documentation on the nuances of NA

# Create a vector w/ missing data
some_nums <- c(1,2,NA, 6, NA, 8)
print(some_nums)
## [1]  1  2 NA  6 NA  8
# Use is.na() to get a vector of TRUE/FALSE for the question "is this element NA?"
is.na(some_nums)
## [1] FALSE FALSE  TRUE FALSE  TRUE FALSE
# Confirm that even w/ NAs, R still knows this is a numeric vector
class(some_nums)
## [1] "numeric"

Strategy 1: Total Eradication

The first approach you may take to dealing with NA values is to simply drop them from your data. If you don’t think these missing data have any business value and your dataset is big enough that you can afford to drop some rows / columns, this is the right move for you.

# Removing NAs for vectors
top5 <- c(
  "Wale"
  , "Chance"
  , NA
  , "Lupe Fiasco"
  , "Shad"
  , "Kanye"
  , NA
)
print(top5)
## [1] "Wale"        "Chance"      NA            "Lupe Fiasco" "Shad"       
## [6] "Kanye"       NA
top5cleaned <- top5[!is.na(top5)]
print(top5cleaned)
## [1] "Wale"        "Chance"      "Lupe Fiasco" "Shad"        "Kanye"
# Removing rows with ANY NAs for data.frames
myDF <- data.frame(
    x = c(1, 2, NA, 4)
    , y = c(NA, TRUE, FALSE, TRUE)
    , z = c("hey", "there", NA, "friends")
)
cleanDF <- myDF[complete.cases(myDF), ]

Strategy 2: Handle on Subsets

You may find the “remove all the NAs everywhere” strategy a bit too aggreesive for your use case. If you have a 100-variable dataset and a single variable (column) is 90% NA values, do you really want to drop every row where that variable is NA? A better approach might be to selectively subset out columns where missing values are most severe before using complete.cases() to remove rows.

# Create a data frame where some variable have more NAs than others
testDF <- data.frame(
    var1 = sample(c(rnorm(99), NA), 200, replace = TRUE)
    , var2 = sample(c(rnorm(50), rep(NA, 50)), 200, replace = TRUE)
    , var3 = sample(c(rnorm(5), rep(NA, 95)), 200, replace = TRUE)
)

# Find columns that are more than 90% missing values
.percent_na <- function(a_vector){
    return(sum(is.na(a_vector)/length(a_vector)))
}
colsToDrop <- apply(testDF, MARGIN = 2, .percent_na) > 0.9
cleanDF <- testDF[, !colsToDrop]

# Remove rows w/ remaining NAs
cleanDF <- cleanDF[complete.cases(cleanDF),]

Strategy 3: Imputation

A final strategy, particularly useful in modeling contexts, is to use some imputation strategy to replace NA values with reasonable alternatives. One common approach (and my favorite), the roughfix method. It works like this: - For numeric columns, replace NAs with the column median - For categorical columns, replace NAs with the most common value

# Create a data frame where some variable have more NAs than others
testDF <- data.frame(
  var1 = sample(c(rnorm(99), NA), 500, replace = TRUE)
  , var2 = sample(c(rnorm(70), rep(NA, 30)), 500, replace = TRUE)
  , var3 = sample(c(rnorm(85), rep(NA, 15)), 500, replace = TRUE)
)

# Clean up w/ roughfix
library(randomForest)
cleanDF <- randomForest::na.roughfix(testDF)

Plotting

The Base Plotting System

R is famous, in part, for its ability to create production-quality plots within the default graphics package it ships with. This plotting paradigm is often referred to as “the base plotting system”, and we’re going to walk through a few examples of it this week.

The essential idea of the base plotting system is to build up plots in layers. You first create a simple 1-variable line plot, for example, then “add on” a legend, more variables, other plot types, etc. We’ll try a few examples using the sample data created below.

# Load up the famous iris dataset
data("iris")
head(iris, n = 10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa

Let’s start with a simple line plot to answer the question are sepal length and sepal width related?.

# Create a simple line plot
plot(
  x = iris$Sepal.Length
  , y = iris$Sepal.Width
  , type = "p"
)

# Try again, but with labels!
plot(
  x = iris$Sepal.Length
  , y = iris$Sepal.Width
  , main = "My Second R plot!"
  , xlab = "sepal length"
  , ylab = "sepal width"
  , type = "p"
)

# Try it AGAIN, this time coloring by species and a legend
plot(
  x = iris$Sepal.Length
  , y = iris$Sepal.Width
  , main = "My Third R plot!"
  , xlab = "sepal length"
  , ylab = "sepal width"
  , type = "p"
  , col = iris$Species
  , bg = iris$Species
  , pch = 21
)

# Add a legend
legend(
  x = 7
  , y = 4.3
  , unique(iris$Species)
  , col = 1:length(iris$Species)
  , pch = 1
)

The base plotting system can be a great tool for quick exploratory analysis of data, such as examination of the distribution of variables in your data.

# Minimal Histogram
hist(iris$Petal.Length)

# Better histogram
hist(
  iris$Petal.Length
  , main = "Distribution of petal length"
  , xlab = "petal length"
  , breaks = 25
)

# Empirical density
plot(
  density(iris$Petal.Length)
  , main = "Empirical density of petal length"
  , col = "blue"
)

You can add more than one variable to these plots! Let’s compare the densities of Sepal length by species

# Overlay densities of Petal length by species
plot(
  density(iris[iris$Species == "setosa", "Petal.Length"])
  , main = "Empirical density of petal length"
  , col = "blue"
  , xlim = c(0, 7)
  , ylim = c(0, 2.5)
)
lines(density(iris[iris$Species == "versicolor", "Petal.Length"]), col = "red")
lines(density(iris[iris$Species == "virginica", "Petal.Length"]), col = "black")

# Add a legend
legend(
  x = 5.5
  , y = 2.25
  , legend = unique(iris$Species)
  , col = c("blue", "red", "black")
  , pch = 1
)

You can control the plotting options to make a grid of plots. The code below creates a 2x2 grid with a density for Sepal Width and scatter plots of the other three variables against sepal width.

# Set global options
par(mfrow = c(2,2))

# Dump in some plots
plot(
  density(iris$Sepal.Width)
  , main = "Empirical density of petal length"
  , col = "red"
)
plot(
  x = iris$Sepal.Width
  , y = iris$Sepal.Length
  , col = iris$Species
  , bg = iris$Species
  , pch = 21
)
plot(
  x = iris$Sepal.Width
  , y = iris$Petal.Length
  , col = iris$Species
  , bg = iris$Species
  , pch = 21
)
plot(
  x = iris$Sepal.Width
  , y = iris$Petal.Width
  , col = iris$Species
  , bg = iris$Species
  , pch = 21
)

# reset options
par(mfrow = c(1,1))

A Note On Graphics Devices

When R (or any other program!) creates plots, it needs to know where to put them! When you call plot() or other commands from within and RStudio session, the default is to just display the resulting figure in the “Plots” pane. However, you can use other graphics devices (places to put visual output) to tell R to put your figures elsewhere.

# Create 10 plots in a loop
outDir <- getwd()
for (i in 1:10){
    # Open a connection to a .png file
    filePath <- paste0(outDir, "/plot_", i, ".png")
    png(filePath)
    
    # Write out a plot to that file
    plot(x = rnorm(100), y = rnorm(100))
    
    # Close the connection to that file
    dev.off()
}

Statistical Analysis

R Was Made for Statistics!

In this section, we’re going to walk through all the steps of basic statistical analysis: getting data, exploring it, creating features, building models, evaluating models, and comparing those models’ performance.

Getting and Splitting the Data

We’re going to work with R’s swiss dataset. This cross-sectional dataset contains measures of fertility and some economic indicators collected in Switzerland in 1888. Our first task will be to load the data and immediately hold out a piece of it as testing data. The idea here is that when we evaluate the performance of our models later on, it will be better to do it on data that the models haven’t seen. This will give a more honest picture of how well they might perform on new data.

In this exercise, we’re going to evaluate the following question: Can we predict fertility based on regional socioeconomic characteristics?

# Load in some data
data("swiss")
# Research: What do the variables mean?
?swiss
# Test vs. Training Data: Let's hold out 40 random records right now for testing
testIndx    <- sample(1:nrow(swiss), size = 10, replace = FALSE)
swissTestDF <- swiss[testIndx,]
swiss       <- swiss[-testIndx,]

Examine Your Training Data

Once you’ve split your data into training and test, you should start poking it a bit to see if you find anything interesting. You can use str() to view the contents of your data objects, summary() to view some basic summary statistics on data frame columns, and cor() to get a correlation matrix between all pairs of numeric variables.

# Look at the structure
str(swiss)
## 'data.frame':    37 obs. of  6 variables:
##  $ Fertility       : num  80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.9 64.1 ...
##  $ Agriculture     : num  17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 45.2 62 ...
##  $ Examination     : int  15 6 5 12 17 9 16 14 16 21 ...
##  $ Education       : int  12 9 5 7 15 7 7 8 13 12 ...
##  $ Catholic        : num  9.96 84.84 93.4 33.77 5.16 ...
##  $ Infant.Mortality: num  22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 24.4 16.5 ...
# Tables of summary stats
summary(swiss)
##    Fertility      Agriculture     Examination     Education    
##  Min.   :35.00   Min.   : 1.20   Min.   : 3.0   Min.   : 1.00  
##  1st Qu.:64.40   1st Qu.:34.00   1st Qu.:12.0   1st Qu.: 7.00  
##  Median :70.50   Median :50.90   Median :16.0   Median : 9.00  
##  Mean   :69.87   Mean   :49.77   Mean   :16.3   Mean   :12.27  
##  3rd Qu.:79.30   3rd Qu.:67.80   3rd Qu.:22.0   3rd Qu.:13.00  
##  Max.   :92.50   Max.   :89.70   Max.   :37.0   Max.   :53.00  
##     Catholic      Infant.Mortality
##  Min.   :  2.15   Min.   :10.80   
##  1st Qu.:  6.10   1st Qu.:18.10   
##  Median : 18.46   Median :20.20   
##  Mean   : 45.46   Mean   :19.99   
##  3rd Qu.: 93.40   3rd Qu.:22.20   
##  Max.   :100.00   Max.   :26.60
# Correlation matrix
round(cor(swiss), 2)
##                  Fertility Agriculture Examination Education Catholic
## Fertility             1.00        0.40       -0.67     -0.73     0.44
## Agriculture           0.40        1.00       -0.72     -0.66     0.47
## Examination          -0.67       -0.72        1.00      0.77    -0.62
## Education            -0.73       -0.66        0.77      1.00    -0.24
## Catholic              0.44        0.47       -0.62     -0.24     1.00
## Infant.Mortality      0.37       -0.08       -0.11     -0.12     0.07
##                  Infant.Mortality
## Fertility                    0.37
## Agriculture                 -0.08
## Examination                 -0.11
## Education                   -0.12
## Catholic                     0.07
## Infant.Mortality             1.00

Hypothesis Testing

Though orthodox hypothesis tests have been maligned in the scientific press recently, they are still enormously valuable tools for discovering differences in datasets and evaluates relationships between variables.

One approach to looking for statistically interesting features (right-hand-side variables) involves binning those variables into “above median” and “below median”, and using a paired t-test to see whether or not the variable is statistically related to the target.

# t-tests

# 1. Is fertility very different in provinces with above-median % of men in Agriculture
swiss$majority_agg <- swiss$Agriculture > median(swiss$Agriculture)
t.test(Fertility ~ majority_agg, data = swiss)
## 
##  Welch Two Sample t-test
## 
## data:  Fertility by majority_agg
## t = -1.5989, df = 29.458, p-value = 0.1205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -15.475664   1.890284
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            66.56842            73.36111
# 2. Let's create this feature for every variable (we'll re-use it)

    #=== a. Drop that majority_agg feature
    swiss$majority_agg <- NULL

    #=== b. Create the binary columns
    x_var_names <- names(swiss)[names(swiss) != "Fertility"]
    for (var_name in x_var_names){

        bin_var_name <- paste0(var_name, "_above_median")
        swiss[, bin_var_name] <- swiss[, var_name] > median(swiss[, var_name])

        # Remember to give the test set these new features!
        swissTestDF[, bin_var_name] <- swissTestDF[, var_name] > median(swiss[, var_name])
    }

We can use lapply() to apply the same test to every variable in our data frame.

# 3. Could just loop over every non-Fertility column and do this test!
bin_var_names <- grep("_above_median", names(swiss), value = TRUE)
t_tests <- lapply(bin_var_names, function(bin_var){
    t_test <- t.test(
        Fertility ~ get(bin_var)
        , data = swiss
    )
    p_val  <- t_test$p.value 
    return(data.frame(
        signal = bin_var
        , p_val = round(t_test$p.value, 2)
        , t_stat = round(t_test$statistic, 2)
    ))
})
tDT <- data.table::rbindlist(t_tests)
tDT
##                           signal p_val t_stat
## 1:      Agriculture_above_median  0.12  -1.60
## 2:      Examination_above_median  0.00   3.86
## 3:        Education_above_median  0.00   3.67
## 4:         Catholic_above_median  0.10  -1.74
## 5: Infant.Mortality_above_median  0.04  -2.13

Simple OLS Model

Ok now that we’ve done some basic preprocessing and exploration, it’s time to start fitting models! Let’s begin with a simple one-variable OLS regression, estimated using the lm() command.

# 4. Simple regression: Fertility = f(Agriculture)
mod1 <- lm(Fertility ~ Agriculture, data = swiss)

# model summary
summary(mod1)
## 
## Call:
## lm(formula = Fertility ~ Agriculture, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.471  -7.913  -2.121   9.400  24.858 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 58.84560    4.74172  12.410 2.25e-14 ***
## Agriculture  0.22157    0.08599   2.577   0.0143 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.42 on 35 degrees of freedom
## Multiple R-squared:  0.1595, Adjusted R-squared:  0.1354 
## F-statistic:  6.64 on 1 and 35 DF,  p-value: 0.01435
# QQ plot (are the residuals normal?)
plot(mod1, which = 2)

Once you have the fitted model, you can pass it and some new data to predict() to generate predictions. I’ve also defined calculations of the mean absolute error, mean squared error, and mean absolute percent error, common error metrics used in regression problems.

# Check how well our model does on the held-out data
preds <- predict(mod1, newdata = swissTestDF)
errors <- swissTestDF$Fertility - preds

MAE   <- mean(abs(errors))
MSE   <- mean(errors^2)
MAPE  <- 100*mean(abs(errors)/swissTestDF$Fertility)

# Actuals vs. Preds plot
plot(
    x = swissTestDF$Fertility
    , y = preds
    , xlab = "Actual Fertility"
    , ylab = "Predicted Fertility"
    , main = "Predictions from a simple OLS model"
    , ylim = c(0, 100)
    , xlim = c(0, 100)
)
lines(x = 0:100, y = 0:100, col = "red")

Wrap Parts of Your Pipeline in Functions

The error metric calculations and plotting we just did seem general enough to apply to other models of this phenomenon. Since we could reuse the code for other models, we should just wrap it in a function so it’s easy to call downstream.

# 5. Before moving on...lets wrap that pipeline in a function
EvaluateModel <- function(model, testDF, modelName){
    # Check how well our model does on the held-out data
    preds <- predict(model, newdata = testDF)
    errors <- testDF$Fertility - preds

    MAE   <- mean(abs(errors))
    MSE   <- mean(errors^2)
    MAPE  <- 100 * mean(abs(errors) / testDF$Fertility)

    # Actuals vs. Preds plot
    plot(
        x = testDF$Fertility
        , y = preds
        , xlab = "Actual Fertility"
        , ylab = "Predicted Fertility"
        , main = paste0(modelName, ": predictions")
        , ylim = c(0, 100)
        , xlim = c(0, 100)
    )

    # Add a 45-degree line
    lines(x = 0:100, y = 0:100, col = "red")

    # add error metrics
    text(x = rep(5,3), y = c(72,80,88), labels = c("MAE: ", "MSE: ", "MAPE: "))
    text(x = rep(15,3), y = c(72,80,88), labels = round(c(MAE, MSE, MAPE), 2))

    # Give back the preds in case user wants to do something customized
    return(list(
        pred = preds
        , MAE = MAE
        , MSE = MSE
        , MAPE = MAPE
    ))
}

OLS with Multiple RHS Variables

To add more variables to a model in R, you have to use formula notation.

# Fit
mod2 <- lm(
    Fertility ~ Education + Infant.Mortality + Examination_above_median
    , data = swiss
)

# Predict + Evaluate
regPreds2 <- EvaluateModel(
    mod2
    , testDF = swissTestDF
    , modelName = "Expanded OLS"
)

Regression Trees

Linear regressions are powerful tools, but there are certain classes of complex relationships that they are unable to express and therefore unable to “learn” when trained on data.

Regression Trees are one mainstream tool used to fit complex functions. They recursively partition the dataset, trying to find “local” models of subsets of the data which, together, provide a better fit than the single “global” OLS model we’re used to fitting.

The details of tree-based learning are outside the scope of this class, but I encourage you to check out A Visual Introduction to Machine Learning to at least get the general intuition behind this and other ML techniques.

# Fit
treeMod <- rpart::rpart(Fertility ~ ., data = swiss)

# Evaluate
dtPreds <- EvaluateModel(
    treeMod
    , swissTestDF
    , "Decision Tree"
)

You can walk through the tree with this awesome package called rattle. This package can be super hard to build… if you run into any issues, there are lots of nice options available via the rpart.plot package.

# Plot the tree
rattle::fancyRpartPlot(
    treeMod
    , main = "Single Decision Tree"
)

Random Forests

For larger datasets than the one we’re working with in this exercise, you may find that a single decision tree is not expressive enough or that one strong signal overpowers all the other variables. To correct for this, many analysts will use a “forest” of decision trees. The implementation detaisl are again outside the scope of this course: see Leo Breiman’s excellent site if you’re curious in learning more about this algorithm.

I only mention the random forest here to demonstrate how easy it is to fit and predict with complex statistical models in R.

# Fit
rfMod <- randomForest::randomForest(
    Fertility ~ .
    , data = swiss
    , ntree = 50
    , do.trace = TRUE
)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##    1 |      432   248.82 |
##    2 |    375.4   216.21 |
##    3 |    293.6   169.08 |
##    4 |    208.2   119.93 |
##    5 |    202.7   116.74 |
##    6 |    206.6   119.00 |
##    7 |    164.9    94.98 |
##    8 |    159.9    92.10 |
##    9 |    156.5    90.13 |
##   10 |    154.2    88.79 |
##   11 |      151    86.94 |
##   12 |    145.5    83.79 |
##   13 |      148    85.22 |
##   14 |    143.6    82.72 |
##   15 |    137.9    79.41 |
##   16 |    131.8    75.89 |
##   17 |    130.7    75.28 |
##   18 |    129.4    74.54 |
##   19 |    125.6    72.36 |
##   20 |    125.4    72.24 |
##   21 |    120.9    69.61 |
##   22 |    120.3    69.28 |
##   23 |    121.9    70.19 |
##   24 |    119.4    68.79 |
##   25 |    118.4    68.22 |
##   26 |    116.3    66.98 |
##   27 |    117.3    67.54 |
##   28 |    117.1    67.45 |
##   29 |    115.5    66.52 |
##   30 |    113.9    65.58 |
##   31 |    109.2    62.91 |
##   32 |    108.9    62.72 |
##   33 |    104.9    60.42 |
##   34 |    105.9    60.99 |
##   35 |    106.1    61.11 |
##   36 |    107.5    61.91 |
##   37 |    105.4    60.69 |
##   38 |    105.7    60.85 |
##   39 |    104.1    59.96 |
##   40 |    102.6    59.06 |
##   41 |    102.7    59.14 |
##   42 |    102.1    58.81 |
##   43 |    102.4    58.97 |
##   44 |    100.8    58.05 |
##   45 |      101    58.18 |
##   46 |      101    58.19 |
##   47 |    100.3    57.76 |
##   48 |      100    57.62 |
##   49 |      101    58.19 |
##   50 |    101.6    58.51 |
# Evaluate
rfPreds <- EvaluateModel(rfMod, swissTestDF, "Random Forest (ntree=100)")

Model Evaluation

In a real project, at some point you will find yourself saying “ok, I have all these models. What should I actually use to generate predictions?”. This question was our motivation for holding out a test set at the beginning of this walkthrough. Accuracy when predicting on previously unseen data is a totally defensible measure for choosing the “best” model from a set of candidate models.

Let’s see how the models we trained in this exercise performed:

# Build a table showing performance of each model type on the holdout
compDF <- data.frame(
    modelName = c("Expanded OLS", "Decision Tree", "Random Forest (ntree=100)")
    , MAE     = c(regPreds2$MAE, dtPreds$MAE, rfPreds$MAE)
    , MSE     = c(regPreds2$MSE, dtPreds$MSE, rfPreds$MSE)
    , MAPE    = c(regPreds2$MAPE, dtPreds$MAPE, rfPreds$MAPE)
)
compDF
##                   modelName      MAE      MSE     MAPE
## 1              Expanded OLS 5.965018 43.31007 8.791261
## 2             Decision Tree 4.358667 31.96601 6.330566
## 3 Random Forest (ntree=100) 3.925778 22.75089 5.817378

Manipulating Data Frames

Columnwise combination with cbind

In situations where you have multiple data frames with the same rows but different columns, you can combine them column-wise with R’s cbind() command. Note that this command will only work if the two data frames to be joined have the same number of rows AND those rows refer to the same observation.

cbind = “column-wise bind”

# Consider the following examples:
dealerDF1 <- data.frame(
    dealer_id = c(1:20)
    , avg_price = runif(20, min = 15, max = 75)
    , avg_margin = runif(20, min = -0.1, max = 0.45)
)

dealerDF2 <- data.frame(
    dealer_id = c(1:20)
    , industry = sample(
        c("Mining", "Healthcare", "Robot Stuff")
        , size = 20
        , replace = TRUE
    )
)

# How can we combine the commercial info with dealer characteristics?
# One approach (if you know the rows represent the same customers) is to use
# cbind()
fullDF <- cbind(dealerDF1, dealerDF2)

Column Matching with merge

cbind() works in the limited situation where you have two data frames that can just be jammed together (same number of rows + rows line up). This doesn’t happen too often. However, it is VERY common in data science workflows to have two mismatched tables of data from different sources and to want to combine them by matching on one or more keys. Think JOIN in SQL or VLOOKUP in Excel. To perform this operation in R, you can use the merge() command.

# If you can't trust that things are identically ordered, use merge()
dealerDF1 <- data.frame(
    dealer_id = c(20:1)
    , avg_price = runif(20, min = 15, max = 75)
    , avg_margin = runif(20, min = -0.1, max = 0.45)
)

dealerDF2 <- data.frame(
    dealer_id = c(1:20)
    , industry = sample(
        c("Mining", "Healthcare", "Robot Stuff")
        , size = 20
        , replace = TRUE
    )
)

fullDF2 <- merge(dealerDF1, dealerDF2, by = "dealer_id")

Rowwise combination with rbind

So far we’ve talked about merging columns from different tables. But what if you want to merge rows? For example, imagine that you are a researcher in a lab studying some natural phenomenon. You may take multiple samples (measuring the same variables) and then want to put them together into a single data frame to build a model. For this case, we can use R’s rbind() function.

rbind = “row-wise bind”

# What if you have the same data for different observations?
sample1 <- data.frame(
    startTime = seq.Date(
        from = as.Date("2016-01-01")
        , to = as.Date("2016-06-30")
        , length.out = 180
    )
    , value = rnorm(180, 5, 2.5)
    , sampleId = rep("sample1", 180)
)


sample2 <- data.frame(
    startTime = seq.Date(
        from = as.Date("2016-07-01")
        , to = as.Date("2017-03-31")
        , length.out = 210
    )
    , value = rnorm(210, 5, 2.5)
    , sampleId = rep("sample2", 210)
)

fullDF <- rbind(sample1, sample2)

# Compare
dim(sample1)
## [1] 180   3
dim(sample2)
## [1] 210   3
dim(fullDF)
## [1] 390   3

Rowwise Combination of Many Tables with rbindlist

rbind() works as a strategy for combining two tables, but what if you have 5 tables? 10? 1000? For those situations, you should checkout rbindlist() from the {data.table} package.

# Define some data frames
sample1 <- data.frame(
    startTime = seq.Date(
        from = as.Date("2016-01-01")
        , to = as.Date("2016-06-30")
        , length.out = 180
    )
    , value = rnorm(180, 5, 2.5)
)

sample2 <- data.frame(
    startTime = seq.Date(
        from = as.Date("2016-07-01")
        , to = as.Date("2017-03-31")
        , length.out = 210
    )
    , value = rnorm(210, 5, 2.5)
)

sample3 <- data.frame(
    startTime = seq.Date(
        from = as.Date("2017-04-01")
        , to = as.Date("2017-04-30")
        , length.out = 30
    )
    , value = rnorm(30, 5, 2.5)
)

sample4 <- data.frame(
    startTime = seq.Date(
        from = as.Date("2017-05-01")
        , to = as.Date("2017-12-31")
        , length.out = 240
    )
    , value = rnorm(240, 5, 2.5)
)

# Load up data.table and rbind all the tables together into one
library(data.table)
fullDF <- data.table::rbindlist(
    list(sample1, sample2, sample3, sample4)
)
head(fullDF)
##     startTime    value
## 1: 2016-01-01 4.744666
## 2: 2016-01-02 2.909827
## 3: 2016-01-03 2.730999
## 4: 2016-01-04 3.989323
## 5: 2016-01-05 6.809396
## 6: 2016-01-06 5.435551
# What if we had one more table with a column none of the others have?
sample5 <- data.frame(
    startTime = seq.Date(
        from = as.Date("2013-05-01")
        , to = as.Date("2013-12-31")
        , length.out = 240
    )
    , value = rnorm(240, 5, 2.5)
    , temperature = runif(240, min = -20, max = 95)
)
fullDF2 <- data.table::rbindlist(
    list(sample1, sample2, sample3, sample4, sample5)
    , fill = TRUE
)
head(fullDF2)
##     startTime    value temperature
## 1: 2016-01-01 4.744666          NA
## 2: 2016-01-02 2.909827          NA
## 3: 2016-01-03 2.730999          NA
## 4: 2016-01-04 3.989323          NA
## 5: 2016-01-05 6.809396          NA
## 6: 2016-01-06 5.435551          NA

Software Principles

Getting Your Project Off the Ground

Ok so you have a business question, you’ve chosen your toolchain (presumably with R), and you have some data in hand. You sit down to write code and, well…


Don’t fear! In this section, we’re going to walk through the process of building a non-trivial script from scratch.

Step 1: Build a Comment Skeleton

Resist the urge to just start writing code. Investing a few minutes upfront in thinking through the structure of your code will pay off in a big way as the project evolves and grows more complicated. Trust me.

First, just write down the main things you want to do. In this exercise, we’re going to write some R code that can generate n-page “books” or random sentences in English.

#=== Write an R script that writes a book! ===#

# Function to create random sentences

# Function to create a "page" with n sentences

# Function to create a "book" with m pages

# Call the book function and create a 5-page book

Step 2: Define Functions

Next, fill in the high-level outline with slightly more specifics. Try to strategize about the functions you’ll need to implement and the individual steps that will have to happen inside each of those functions. This will probably change once you start writing code, but in my experience it’s always easier to have a plan and change it.

# Function to create random sentences
createSentence <- function(){

    # Define a list of nouns

    # Define a list of adjectives

    # Define a list of adverbs

    # Define a list of verbs

    # Define a list of articles

    # Define a list of prepositions

    # Choose randomly from each, construct a sentence of the form 
    # article-adjective-noun-adverb-verb-preposition

    # Return the sentence

} 

# Function to create a "page" with n sentences
createPage <- function(n){

    # Call the function to create pages n times

    # Paste the n results together into one string, separated by a period and a space.

    # Return that one string
}

# Function to create a "book" with m pages
createBook <- function(n_pages, sentences_per_page){

    # Call the function to create pages n_pages times

    # Return a list with 1 page per element
}

# Call the book function and create a 5-page book

Pirate Book Example

You’ll spend the most time on this and you will have to go through it many times before you’re feeling comfortable with the code. I’ve given one implementation below. The point here is not to show you how to create a book-writing app, but rather to demonstrate that this somewhat complicated task was made easier by breaking it into smaller, more manageable pieces.

# Function to create random sentences
createSentence <- function(){

    # Define a list of nouns
    nouns <- c("parrot", "giant squid", "whale", "captain", "first mate")

    # Define a list of adjectives
    adjectives <- c("hearty", "brave", "grimy", "tough", "swarthy")

    # Define a list of adverbs
    adverbs <- c("quickly", "carefully")

    # Define a list of verbs
    verbs <- c("scurried", "trundled", "rolled", "walked")

    # Define a list of articles
    articles <- c("a", "the")

    # Define a list of prepositions
    prepositions <- c("away", "below")

    # Construct a sentence of the form article-adjective-noun-adverb-verb.
    sentence <- paste(
        sample(articles, 1)
        , sample(adjectives, 1)
        , sample(nouns, 1)
        , sample(adverbs, 1)
        , sample(verbs, 1)
        , sample(prepositions, 1)
    )

    # Return the sentence
    return(sentence)
} 

# Function to create a "page" with n sentences
createPage <- function(n){

    # Call the function to create pages n times
    some_sentences <- sapply(
        X = 1:n
        , FUN = function(x){
            createSentence()
        }
    )

    # Paste the n results together into one string, separated by a period and a space.
    a_page <- paste(some_sentences, collapse = ". ")

    # Return that one string
    return(a_page)
}

# Function to create a "book" with m pages
createBook <- function(n_pages, sentences_per_page){

    # Call the function to create pages n_pages times
    a_book <- lapply(
        X = 1:n_pages
        , FUN = function(x){
            createPage(n = sentences_per_page)
        }
    )

    # Return a list with 1 page per element
    return(a_book)
}

# Call the book function and create a 5-page book
aPirateTale <- createBook(n_pages = 5, sentences_per_page = 10)
print(aPirateTale)
## [[1]]
## [1] "a hearty captain quickly scurried below. a brave first mate carefully scurried below. the brave giant squid carefully trundled away. a brave parrot carefully walked away. a hearty captain carefully walked below. the swarthy first mate quickly rolled away. the hearty captain carefully rolled away. a tough parrot quickly trundled away. a tough first mate quickly scurried away. a brave whale carefully rolled away"
## 
## [[2]]
## [1] "a swarthy captain carefully trundled below. the hearty whale carefully walked away. the hearty giant squid carefully trundled below. the grimy parrot quickly walked below. the brave parrot carefully rolled away. the tough captain quickly walked away. a tough giant squid carefully trundled away. the grimy giant squid quickly rolled below. the brave whale carefully trundled below. the grimy whale carefully rolled away"
## 
## [[3]]
## [1] "the brave first mate quickly rolled below. a brave whale carefully scurried below. the grimy captain carefully rolled below. a swarthy first mate quickly scurried below. the grimy first mate quickly walked below. a grimy giant squid carefully walked away. the swarthy first mate carefully walked away. the tough parrot carefully scurried away. a tough parrot quickly walked away. the hearty whale carefully scurried below"
## 
## [[4]]
## [1] "the hearty captain quickly scurried below. the grimy captain carefully rolled below. a hearty whale carefully rolled away. a tough captain carefully walked below. a hearty parrot carefully walked away. the swarthy parrot quickly trundled below. a grimy giant squid quickly scurried away. the tough parrot carefully rolled away. the hearty first mate carefully rolled below. the grimy parrot carefully rolled below"
## 
## [[5]]
## [1] "a hearty whale carefully trundled below. a tough parrot carefully rolled below. a tough whale quickly trundled below. a tough first mate quickly trundled away. the swarthy parrot carefully walked away. a brave parrot quickly walked below. the grimy whale carefully walked below. the grimy parrot quickly scurried below. the tough parrot carefully walked below. the grimy first mate quickly rolled below"

Text Processing

Common Preprocessing Steps

In this section, we’re going to revisit the Shakespeare corpus we looked at in Week 2 and implement a basic “keyword extraction” pipeline. Let’s start by loading it and doing some common string preprocessing on it.

# Grab a random 5000 lines from the corpus (skipping a bunch of that MIT text at the beginning)
shakespeareFile <- "https://ia800300.us.archive.org/5/items/shakespearessonn01041gut/wssnt10.txt"
bsText <- readLines(con = shakespeareFile, n = 10000)[5001:10000]

# 1 - Convert everything to lowercase
bsText <- tolower(bsText)

# 2 - Trim leading and trailing whitespace (e.g. change "   a" to "a")
bsText <- trimws(bsText)

# 3 - Remove empty lines
bsText <- bsText[sapply(bsText, nchar)>0]

Regular Expressions

After applying these basic steps, we’re going to want to do some more powerful things like removing or replacing text based on particular character patterns. It is time to enter the mystical world of regular expressions.

Regular expressions (used in many programming languages) offer a way to express complex pattern matching. Let’s work through some examples to demonstrate the properties.

# 4 - Remove punctuation
bsText <- gsub(
    pattern = ";|,|!|?|\\.|\\:|<|>|\\]|\\["
    , replacement = ""
    , x =  bsText
)

You can run sample(bsText, 10) and see that our data are looking cleaner…but we still have work to do! Next, let’s use regular expressions (commonly just called “regex”) to handle some other issues.

# 5 - split some common contractions into two words
bsText <- gsub("he('s)", "he is", bsText)
bsText <- gsub("'ll", " will", bsText)

# 6 - Change any numbers to __number__
bsText <- gsub("[:digit:]", "__number__", bsText)

Tokenization

The next task we need to acomplish is tokenizing our text, i.e. splitting lines and sentences into individual words. These individual words can then be used downstream to get build a language model and identify key terms.

# 7- Loop over the vector of lines, split on whitespace, create a list of data.frames
library(stringr)
library(data.table)
allWords <- lapply(
    X = bsText
    , FUN = function(thisLine){
        return(data.frame(words = str_split(thisLine, " ")[[1]]))
    }
)

# 8 - Put into a data.table and print that just to make sure it worked
wordDT <- data.table::rbindlist(allWords)
print(wordDT[1:10], row.names = FALSE)
##  words
##   <NA>
##   <NA>
##   <NA>
##   <NA>
##   <NA>
##   <NA>
##   <NA>
##   <NA>
##   <NA>
##   <NA>

Counting Words in Text

Now that our data are a bit cleaner, it’s time to try finding key terms! Broadly speaking, “key terms” in a body of text are those that more common in the text than they are in the language as a whole (e.g. “the” will never be a key word). We aren’t looking at any actual data on the distribution of words in Shakespeare-era English in this exercise, so we’ll just drop the top 20 words and call the next 20 “key”.

The data.table package makes this operation easy to carry out.

# 9 - Get Word counts and sort by those counts
wordCountDT <- wordDT[, .N, by = words]
data.table::setnames(
    wordCountDT
    , old = "N"
    , new = "word_count"
)
data.table::setorder(wordCountDT, -word_count)
print(wordCountDT[1:10], row.names = FALSE)
##  words word_count
##   <NA>       5000
##   <NA>         NA
##   <NA>         NA
##   <NA>         NA
##   <NA>         NA
##   <NA>         NA
##   <NA>         NA
##   <NA>         NA
##   <NA>         NA
##   <NA>         NA
key_words <- wordCountDT[21:40]

R Programming Best Practices

Scripting: Style Notes

Use this slide as a general reference of coding best practices.

  • Declare all the dependencies you need in a bunch of library() calls at the top of your script(s)
  • Set global variables (like file paths) in all-caps near the top of your script(s)
  • Use comments like #===== Section 1 - Do Stuff =====# to separate major sections of the code
  • Namespace any calls to external functions with :: (e.g. lubridate::ymd_hms())

For other practices to keep your code clean and readable, check out Hadley Wickham’s style guide for R.

My script below shows an example of a typical layout for professional-quality R code.

make_plot.R

#===== 1. Setup =====#

# Load dependencies
library(data.table)
library(quantmod)
library(rbokeh)
library(purrr)

# Set global params
FRED_SERIES <- "CPIAUCSL" # UNRATE
TITLE       <- "U.S. CPI" # "U.S. unemployment""
VAR_UNITS   <- "Index (1982-1984 = 100)" # "% of labor force"

#===== 2. Get Data =====#

# Get data from FRED
quantmod::getSymbols(Symbols = FRED_SERIES, src = 'FRED')
## [1] "CPIAUCSL"
# Put it in a data.table
dataDT <- data.table::data.table(
    get(FRED_SERIES)
    , keep.rownames = TRUE
)

#===== 3. Plot it! =====#

# Plot it!
rbokeh::figure(
    data = dataDT
    , title = TITLE
    , ylab = VAR_UNITS
    , xlab = "date"
    , width = 800
) %>% rbokeh::ly_lines(
    x = dataDT[,index]
    , y = dataDT[,get(FRED_SERIES)]
    , color = "blue"
  )

Creating R Packages

As I mentioned in the first two weeks of class, there are around 10,000 packages currently on CRAN but MANY more privately-created packages in use by academics, professional data science teams, and hobbyists.

I’d like to demystify the package through an exercise…we are going to build an R package from scratch here in class! Packages are a robust and effective way to share your code with others and to manage dependencies in your code.

check out this article on how Airbnb uses internal R packages

Introducing RquetteBB

We are going to create a package called RquetteBB, a package for fetching, cleaning, and visualizing Marquette basketball statistics.

Before we begin, be sure that you have a few of the mainstream package-building libraries in R. If you don’t have any of these in your R library, please install them:

  • remotes = R wrappers around some command-line tools used to build R packages. Also includes some utilities for installing packages from source code.
  • roxygen2 = provides a framework for turning specially-formatting comments in R functions into package documentation that you can call with ?
  • testthat = a popular library for running unit tests, essentially tests that your code works as it should
library(remotes)
library(roxygen2)
library(testthat)

Building a Package Skeleton

  • An R package requires a particular directory structure, as explained below:
    • DESCRIPTION = a file containing information on dependencies, authors, and other useful metadata
    • NAMESPACE = an auto-generated file that contains informaiton on the names of objects defined in your package and the names of objects from other packages that your package relies on
    • LICENSE = a text file with some statement of the license that applies to the package. Don’t stress…this can be informal and goofy.
    • R/ = a directory with all the “.R” functions that your package exports
    • tests/testthat = a directory with all the tests used to verify that your functions are working correctly
    • tests/testthat.R = a script that instructs R how to run your unit tests
    • man/ = a directory with all the documentation that accompanies your functions. The files in here are auto-generated by {roxygen}

You can do most of the setup with R code!

  1. set your working directory (with setwd()) to wherever you’d like the package to live.
repo_directory <- file.path(Sys.getenv("HOME"), "repos")
dir.create(
    repo_directory
    , showWarnings = FALSE
)
setwd(repo_directory)
  1. run package.skeleton() to create a basic package setup
# Create the package skeleton
package_name <- "RquetteBB"
package.skeleton(
     name = package_name
)

# Remove some unnecessary files
package_path <- file.path(repo_directory, "RquetteBB")
file.remove(file.path(package_path, "Read-and-delete-me"))
file.remove(file.path(package_path, "NAMESPACE"))
unlink(file.path(package_path, "data"), recursive = TRUE)
unlink(file.path(package_path, "man"), recursive = TRUE)
  1. open up DESCRIPTION in the text editor of choice
    • change version number to 0.1
    • put your information (first name, last name, email) in the Author section
    • put an informative description in Description:
    • change license to License: file LICENSE
    • put an informative Title in Title:
  2. create a file called LICENSE
license_txt <- "You can use my stuff but you better give me credit!"
license_file <- file.path(package_path, "LICENSE")
file.create(license_file)
write(license_txt, file = license_file)
  1. create the man/ directory to store documentation files
dir.create(file.path(package_path, "man"), showWarnings = FALSE)
  1. create the tests/ and tests/testthat directories
    • create a blank R script, tests/testthat.R
    • create a blank R script, tests/testthat/test-my_functions.R
test_path <- file.path(package_path, "tests", "testthat")
dir.create(
    test_path
    , showWarnings = FALSE
    , recursive = TRUE
)
file.create(file.path(package_path, "tests", "testthat.R"))
file.create(file.path(test_path, "test-my_functions.R"))
  1. create a blank R script, R/my_functions.R
file.create(file.path(package_path, "R", "my_functions.R"))

Alternatively, I’ve provided a shell script you can run to generate this package.

make_rquettebb_pkg.sh

# Create the package directory
mkdir -p ${HOME}/repos
cd ${HOME}/repos

# Create package skeleton
Rscript -e "package.skeleton(name = 'RquetteBB', list = c('LETTERS'))"
cd RquetteBB

# Remove unnecessary stuff
rm Read-and-delete-me
rm -rf data/
rm man/*.Rd
rm NAMESPACE

# Create all the required directories
mkdir -p R
mkdir -p tests/testthat

# Create all the required files
echo 'You can use this but you better cite me!\n' > LICENSE
touch R/my_functions.R
touch tests/testthat.R
touch tests/testthat/test-my_functions.R

Write Your First Function

Next, it’s time to add your first function to the package! Some of the items we discuss later (like editing the DESCRIPTION file) will really only make sense if we have a least one function in the package.

Open up R/my_functions.R. Let’s add a function called NamesToAbbreviations. We’ll use this function later on to create plot labels from players’ names.

RquetteBB/R/my_functions.R

#' @title Names to abbreviations
#' @name NamesToAbbreviations
#' @description This function takes a vector of names and returns a vector of corresponding initials.
NamesToAbbreviations <- function(nameVec){

    # Split on whitespace
    splitNames <- stringr::str_split(
        nameVec
        , pattern = " "
        , simplify = FALSE
    )

    # Grab the first letter of each name and combines
    abbrevs <- purrr::map_chr(splitNames, function(splitName){
        firstInitial <- substr(splitName[1], 1, 1)
        lastInitial  <- substr(splitName[2], 1, 1)
        return(paste0(firstInitial, lastInitial))
    })

    # Return the character vector of abbreviations
    return(abbrevs)
}

Install the Package

Now that we have a function in the package, we need to generate documentation for the package and install it! Do the following:

  1. Set your working directory to the package
package_path <- file.path(Sys.getenv("HOME"), "repos", "RquetteBB")
setwd(package_path)
  1. Run roxygen2::roxygenize(). This function will populate your NAMESPACE file and (if we had written any yet) generate documentation that users could call with ?
rxygen2::roxygenize()
  1. Run remotes::install_local() to install the package!
remotes::install_local(package_path, force = TRUE)
  1. Run rm(list=ls()) to clear your working environment. Restart your R session.
  2. Run library(RquetteBB) to load the package
  3. Run ?NamesToAbbreviations to see the documentation for that function

Congratulations on creating your first R package! Lots of work still to be done, but you now have a minimum viable product (MVP) of the RquetteBB package.

Document Your Functions with roxygen2

As you saw when we ran ?NamesToAbbreviations, the documentation for our package is looking pretty weak right now. In this section, you’ll learn how to document your functions with the popular {roxygen2} package.

install.packages("roxygen2")

Return to R/my_functions.R. Right above your function, add some comments like those shown below. Note that the use of an apostrophe (#') is intentional and important here.

RquetteBB/R/my_functions.R

#' @title Names to abbreviations
#' @name NamesToAbbreviations
#' @description This function takes a vector of names and returns a vector of corresponding initials.
#' @param nameVec A character vector of player names
#' @importFrom purrr map_chr
#' @importFrom stringr str_split
#' @export
NamesToAbbreviations <- function(nameVec){
#...

When you’ve added this information, save my_functions.R and re-run roxygen::roxygenize(). Now try running ?NamesToAbbreviations to see your documentation!

These metadata items are called “roxygen comments”. Lines in package R files that begin with #' are analyzed by the {roxygen2} package when you run roxygen2::roxygenize().

  • @title: A short description of what the function does
  • @name: The name of the function
  • @description: A longer description of what the function does
  • @param: An explanation of the purpose of a particular function arguments, the types of valid inputs to that function, and the consequences of passing different values to it. You should have one of these items per function argument.
  • @importFrom: A list of functions from an external library that your particular function needs. These are of the form @importFrom package_name func1 func2 func3
  • @export: Putting this in your roxygen comments tells roxygen to export this function to the package’s namespace (so users of your package can call it)

Next, open up DESCRIPTION in a text editor of your choice. We need to add these two new dependencies! Under the Depends: line, add an Imports: blocks like this:

RquetteBB/DESCRIPTION

Package: RquetteBB
Title: Marquette Basketball Data in R
Version: 0.1
Authors@R: person("James", "Lamb", email = "jaylamb20@gmail.com", role = c("aut", "cre"))
Description: This package can be used to pull, analyze, and visualize Marquette men's basketball data.
Depends: R (>= 3.3.2)
Imports:
    purrr,
    stringr
License: file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.0.2

There are four types of dependency declarations used in R package DESCRIPTION files.

  • Depends: This section holds all things that are checked when someone calls library() on your package. Typically this section will just be used to enforce a minimum version of R that users of your package must have. If you put R packages here, they will automatically be loaded with library() whenever someone loads your package.
  • Imports: This section is really important. Imports holds the name of all the packages that your package’s functions use. These packages will not be loaded by default when someone calls library(your_package) but they will be installed when someone tries to install your package.
  • Suggests: R packages that are used to support your package but aren’t needed to run the code. Examples that commonly end up here are {testthat} for writing unit tests and {rmarkdown} for building documentaiton PDFs.
  • LinkingTo: Irrelevant unless your package has C, C++, or Fortran code in it.

Unit Tests

In software development, it’s a best practice to write your code in modular, composible “units”. Units are essentially small, related pieces of functionality. It’s also common to write tests for these bit of code, commonly referred to as “unit tests”.

The main idea behind unit tests is to try to imagine every possible type of input that a user might try to feed to your function, and then to test that the function does what it should when given that input. These tests are tedious to write at first, but they help you to make chagnes rapidly in the future…you can experiment and innovate with confidence when you know that your tests will catch any mistakes you make!

The preferred unit testing framework in R is a package called {testthat}. The easiest way to learn this package is just to use it…let’s try it out!

install.packages("testthat")

Open tests/testthat/test-my_functions.R and add the following code.

RquetteBB/tests/testthat/test-my_functions.R

context("Testing RquetteBB functions")

#===== 1. NamesToAbbreviations
test_that("NamesToAbbreviations should return the expected output for simple vectors", {
    someNames <- c("Joe Buck", "Chris Berman", "Michelle Tafoya")
    expect_identical(NamesToAbbreviations(someNames), c("JB", "CB", "MT"))
    expect_true(class(NamesToAbbreviations(someNames)) == "numeric")
})

Save this file, the run testthat::test_dir("tests/testthat/") from the console. You should get something like this:

The text in white is the “context” you specified. The white dot signifies one passed tests. The number 1 indicates your first test failure. I you had 5 failures and 12 successes you may see somthing like .1...23...4...5...

It looks like we messed up our second test! We were testing that the result would be numeric, but this funciton will always return a character vector. Change the “numeric” to “character’” in your test file and re-run.

FWIW…the test we just wrote is called an “end-to-end” test. Basically, that means a “test where we didn’t do anything weird”. End-to-end tests are the most straightfoward, since they just involve testing that the funciton behaves as expected with the expected input type. Other tests (we may write some later) go out of their way to test behavior in particular weird situations.

Now that we’re using testthat, we should add it to the Suggests section in our DESCRIPTION file. Open up that file and add this under your Imports section.

RquetteBB/DESCRIPTION

Suggests:
    testthat

Add a Function to Scrape Marquette Basketball Data

Next, let’s add another function. This function, which we’ll call GetData, should go out to espn.com and scrape down player statistics for any specified season of Marquette basketball.

RquetteBB/R/my_functions.R

#' @title Get Marquette Men's Basketball Data
#' @name GetData
#' @description This function scrapes sports-reference.com and returns some 
#'              player statistics for Marquette's men's basketball team.
#' @param season String indicating which season you want to pull data for. 
#'               e.g. set to "2017" for 2016-17 season stats
#' @importFrom data.table data.table
#' @importFrom htmltab htmltab
#' @export
GetData <- function(season = "2018"){

    # Build URL
    queryURL <- paste0(
        "https://www.sports-reference.com/cbb/schools/marquette/"
        , season
        , ".html"
    )

    # Grab tables from college basketbal reference
    result <- htmltab::htmltab(
        queryURL
        , which = "//table[@id = 'per_game']"
    )

    # Convert to data.table
    statDT <- data.table::as.data.table(result)

    # Coerce all the columns but Player to numeric
    numCols <- setdiff(names(statDT), "Player")
    statDT <- cbind(
        statDT[, "Player"]
        , statDT[, lapply(.SD, as.numeric), .SDcols = numCols]
    )

    # Return the table
    return(statDT)
}

Now that we’ve added a new function, we need to re-do a few thing:

  1. Re-run roxygen2::roxygenize() to regenerate package documentaiton
  2. Add any new packages we’ve added to the DESCRIPTION file
    • for this, we need to add data.table and XML
  3. Write unit tests
    • Let’s just add one end-to-end test

RquetteBB/tests/testthat/test-my_functions.R

context("Testing RquetteBB functions")

#===== 1. NamesToAbbreviations

# End-to-end test
test_that("NamesToAbbreviations should return the expected output for simple vectors", {
    someNames <- c("Joe Buck", "Chris Berman", "Michelle Tafoya")
    expect_identical(NamesToAbbreviations(someNames), c("JB", "CB", "MT"))
    expect_true(class(NamesToAbbreviations(someNames)) == "character")
})

#===== 2. NamesToAbbreviations

# End-to-end test
test_that("GetData should return a data.table", {
    statDT <- GetData(season = "2018")
    expect_true("data.table" %in% class(statDT))
    expect_true("Player" %in% names(statDT))
})

Add One More Function to Plot Data

You’re doing great! We should add one more function to the package. This function, CompareOnStats(), should take a data table of season stats and create a scatterplot showing pairs of statistics for each player. For example, you could use this function to answer the question Did Marquette’s highest scorers also tend to be beter rebounders?.

You’re a pro at adding new functions now, so I won’t belabor the points about adding documentation, updating the DESCRIPTION file, etc.. Let’s jump straight into an implementation of the funciton:

RquetteBB/R/my_functions.R

#' @title Scatterplot Comparison of Player Stats
#' @name CompareOnStats
#' @description Given a data.table of Marquette men's basketball statistics, plot a bivariate
#'              scatterplot of player performance on the basis of two statistics
#' @import data.table
#' @importFrom graphics plot text
#' @param statDT A data.table of Marquette season stats, pulled with \code{\link[RquetteBB]{GetData}}
#' @param x_stat A string with the name of the statistic to plot on the x-axis
#' @param y_stat A string with the name of the statistic to plot on the y-axis
#' @param season String indicating which season you pull data for. Used to create plot title 
#' @param useInitials A boolean. If TRUE, player initials will be plotted. If FALSE, full names will be used.
#' @export
CompareOnStats <- function(statDT
                           , x_stat = "PTS"
                           , y_stat = "TRB"
                           , season
                           , useInitials = TRUE
){

    # Create a scatterplot
    plot(
        x = statDT[, get(x_stat)]
        , y = statDT[, get(y_stat)]
        , xlab = x_stat
        , ylab = y_stat
        , type = "p"
        , col = "white"
        , main = paste0(x_stat, " vs. ", y_stat, " (", season, ")")
    )

    # Grab abbreviations from player names if asked
    if (useInitials){
        namesToPlot <- NamesToAbbreviations(statDT[, Player])
    } else {
        namesToPlot <- statDT[, Player]
    }

    # Add those abbreviations to the plot
    text(
        x = statDT[, get(x_stat)]
        , y = statDT[, get(y_stat)]
        , labels = namesToPlot
    )

    return(invisible(NULL))
}

Try Using the Package

Congratulations on reaching the end of this tutorial. You’ve now created your first R package! We can use it to do some cool stuff.

# Load up our package
library(RquetteBB)

# Plot the 2002-2003 data
statsDT <- GetData(season = "2003")
CompareOnStats(
    statDT = statsDT
    , "PTS"
    , "TRB"
    , season = "2002-03"
    , useInitials = FALSE
)

# Plot the 2018-2019 data
statsDT <- GetData(season = "2019")
CompareOnStats(
    statDT = statsDT
    , "PTS"
    , "TRB"
    , season = "2018-19"
    , useInitials = FALSE
)